Mini-Project: SVM & LR Classification

2017-2018 California Department of Education Mathmematics Achievement
Created by An Nguyen, Andy Ho, Jodi Pafford, Tori Wheelis
February 06, 2019
In [107]:
import pandas as pd
import numpy as np
import datetime as dt
import plotly as ply
from sklearn.model_selection import ShuffleSplit as ss
from sklearn.linear_model import LogisticRegression as lr
from sklearn import metrics as mt
from sklearn.preprocessing import StandardScaler as sts
from sklearn.pipeline import Pipeline as pl
from sklearn.svm import SVC
from matplotlib import pyplot as plt
from pandas.tools.plotting import boxplot
from __future__ import print_function as pr

##working object will be read latter on
#rainfall_original = pd.read_csv('weatherAus.csv') 
In [108]:
#functions to find the average value using the bracketing values around the NaN, will only use values if they are 
#   from the same city.
#if NaN is the earliest timepoint for a given city the next timepoint with no NaN will be given instead of the mean
#if NaN is the latest timepoint for a given city the previous timepoint with no NaN will be given instead of the mean

def impute_by_city(cities,variables):
    for c in cities:
        temp = rainfall[rainfall.Location == c] #parse out observations from a single city
        #interate through all observations of the temp data file
        i = min(temp.index)
        while i <= max(temp.index):
            for v in variables:
                if pd.isna(temp[v]).all():
                    pass
                elif pd.isna(temp[v][i]):
                    temp[v][i] = find_mean(temp[v], i)
                    rainfall[v][i] = temp[v][i]
            i = i + 1       

def find_mean(templist, index):
    if index == min(templist.index): #if earliest timepoint take the next value that is not NaN
        return find_top(templist, index)
    elif index == max(templist.index): #if latest timepoint take the previous value that is not NaN
        return find_bottom(templist, index)
    else:
        bottom = find_bottom(templist, index) #find previous non-NaN value
        top = find_top(templist, index) #find next non-NaN value
    if pd.isna(top): #if there are no more non-NaN values return the previous non-NaN value
        return bottom
    else:
        mean = (top + bottom)/2
        return mean

#find previous non-NaN value
def find_bottom(templist, index):
    while pd.isna(templist[index-1]):
        index = index-1
    bottom = templist[index-1]
    return bottom

#find next non-NaN value
#if there are no more non-NaN values return the previous non-NaN value
def find_top(templist, index):
    while pd.isna(templist[index+1]):
        index = index+1
        if index == max(templist.index):
            top = np.nan
            return top
    top = templist[index+1]
    return top          
In [109]:
##can be skipped if rainfall.csv already generated!
#rainfall = rainfall_original.copy()
#rainfall.drop(["RISK_MM"], axis=1, inplace=True) #RISK_MM was used to extrapolate response variable.
#rainfall['Date'] =  pd.to_datetime(rainfall['Date'], format='%Y-%m-%d') #change 'Date' variable to datetime64
#rainfall.dropna(subset=["RainToday"], inplace=True) #drop any observation with no record of rainfall for the day,
                                                    #   cannot be imputed
#rainfall = rainfall.reset_index(drop=True) #reset the index after drops
#rainfall.info()
In [110]:
##can be skipped if rainfall.csv already generated!
##set the cardinal directions to degrees
#directions = {'N':0, 'NNE':22.5, 'NE':45, 'NE':45, 'ENE':67.5, 'E':90, 'ESE':112.5, 'SE':135, 'SSE':157.5, 'S':180,\
#              'SSW':202.5, 'SW':225, 'WSW':247.5, 'W':270, 'WNW':292.5, 'NW':315, 'NNW':337.5}
#cities = rainfall.Location.unique() #get name of all cities in the data frame
#c_variables = [] #variables with continuous values
#d_variables = [] #variables with discreet values
        
#rainfall = rainfall.replace(directions) #replace cardianl direction to their corresponding degrees

##change 'Yes' and 'No' to 1 and 0 respectively
#rainfall.RainToday = rainfall.RainToday=='Yes'
#rainfall.RainToday = rainfall.RainToday.astype(np.int)
#rainfall.RainTomorrow = rainfall.RainTomorrow=='Yes'
#rainfall.RainTomorrow = rainfall.RainTomorrow.astype(np.int)

#for l in list(rainfall):
#    if (rainfall[l].dtypes == 'float64'):
#        c_variables.append(l)
#    else:
#        d_variables.append(l)
In [111]:
##can be skipped if rainfall.csv already generated!
##very expensive, rainfall.csv can be uploaded from work directory
#impute_by_city(cities, c_variables) #impute values to NaN 
#rainfall.to_csv("rainfall.csv", sep=',', index=True) #save to csv for later use
In [112]:
#load pre-generated rainfall.csv file
rainfall = pd.read_csv('rainfall.csv', index_col=0) 
rainfall.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 140787 entries, 0 to 140786
Data columns (total 23 columns):
Date             140787 non-null object
Location         140787 non-null object
MinTemp          140787 non-null float64
MaxTemp          140787 non-null float64
Rainfall         140787 non-null float64
Evaporation      97184 non-null float64
Sunshine         89329 non-null float64
WindGustDir      134862 non-null float64
WindGustSpeed    134862 non-null float64
WindDir9am       140787 non-null float64
WindDir3pm       140787 non-null float64
WindSpeed9am     140787 non-null float64
WindSpeed3pm     140787 non-null float64
Humidity9am      140787 non-null float64
Humidity3pm      140787 non-null float64
Pressure9am      129190 non-null float64
Pressure3pm      129190 non-null float64
Cloud9am         107253 non-null float64
Cloud3pm         107253 non-null float64
Temp9am          140787 non-null float64
Temp3pm          140787 non-null float64
RainToday        140787 non-null int64
RainTomorrow     140787 non-null int64
dtypes: float64(19), int64(2), object(2)
memory usage: 25.8+ MB
In [113]:
rainfall = rainfall.drop(['Evaporation', 'Sunshine'], axis = 1) #dropped variable becuase there are too many NaN's
l = list(rainfall.Location.unique())
rainfall.dropna(subset = list(rainfall), inplace = True) #dropped city becuase variable was not recorded by said city
#check to see which city was droped
for i in l:
    if i not in rainfall.Location.unique():
        print(i)
rainfall = rainfall.drop(['Date', 'Location'], axis = 1) #not needed for prediction
BadgerysCreek
Newcastle
NorahHead
Penrith
Tuggeranong
MountGinini
Nhil
Dartmoor
GoldCoast
Adelaide
Albany
Witchcliffe
SalmonGums
Walpole

Logistic Regression

In [114]:
lr_rainfall = rainfall.copy()
# we want to predict the X and y data as follows:
if 'RainTomorrow' in lr_rainfall:
    y = lr_rainfall['RainTomorrow'].values # get the labels we want
    del lr_rainfall['RainTomorrow'] # get rid of the class label
    x = lr_rainfall.values # use everything else to predict!  
    
# split our data into training and testing splits
num_cv_iterations = 5
num_instances = len(y)
cv_object = ss(n_splits=num_cv_iterations, test_size  = 0.2)
                         
print(cv_object)
ShuffleSplit(n_splits=5, random_state=None, test_size=0.2, train_size=None)
In [115]:
# iterate over the coefficients
column_names = lr_rainfall.columns
weights = []
weights_array = []

scl_obj = sts()
for iter_num, (train_indices, test_indices) in enumerate(cv_object.split(x,y)):
    scl_obj.fit(x[train_indices]) # find scalings for each column that make this zero mean and unit std

    X_train_scaled = scl_obj.transform(x[train_indices]) # apply to training
    X_test_scaled = scl_obj.transform(x[test_indices]) # apply those means and std to the test set (without snooping at the test set values)

    # train the model just as before
    lr_clf = lr(penalty='l2', C=0.05) # get object, the 'C' value is less (can you guess why??)
    lr_clf.fit(X_train_scaled,y[train_indices])  # train object

    y_hat = lr_clf.predict(X_test_scaled) # get test set precitions
    
    acc = mt.accuracy_score(y[test_indices],y_hat)
    conf = mt.confusion_matrix(y[test_indices],y_hat)
    print("")
    print('accuracy:', acc )
    print(conf )

    # sort these attributes and spit them out
    #zip_vars = zip(lr_clf.coef_.T,column_names) # combine attributes
    #zip_vars = sorted(zip_vars)
    zip_vars = pd.Series(lr_clf.coef_[0].T, index=column_names)
    for name, coef in zip_vars.items():
        print(name, 'has weight of', coef) # now print them out
        weights.append(coef)
    weights_array.append(weights)
    weights = []
weights_array = np.array(weights_array)
accuracy: 0.8541892825421888
[[15084   831]
 [ 2124  2227]]
MinTemp has weight of 0.06295914334934742
MaxTemp has weight of 0.034849158798397346
Rainfall has weight of 0.08571115116011255
WindGustDir has weight of 0.046123744314107325
WindGustSpeed has weight of 0.7083020245535737
WindDir9am has weight of -0.10772721892079865
WindDir3pm has weight of 0.08030550143643432
WindSpeed9am has weight of -0.05701437919045632
WindSpeed3pm has weight of -0.25486688209488223
Humidity9am has weight of 0.08857916304843116
Humidity3pm has weight of 1.142901482325841
Pressure9am has weight of 1.1083304725272525
Pressure3pm has weight of -1.526504106385531
Cloud9am has weight of 0.15049990986259254
Cloud3pm has weight of 0.3752804616098779
Temp9am has weight of 0.17084566210438792
Temp3pm has weight of -0.3050544710127873
RainToday has weight of 0.19752541921425334

accuracy: 0.855028125925195
[[15172   799]
 [ 2139  2156]]
MinTemp has weight of 0.073731136653677
MaxTemp has weight of 0.11606319165499902
Rainfall has weight of 0.07461299673054736
WindGustDir has weight of 0.040384328488588014
WindGustSpeed has weight of 0.7016216424064348
WindDir9am has weight of -0.10675406539498458
WindDir3pm has weight of 0.08809875222834829
WindSpeed9am has weight of -0.07526667694561333
WindSpeed3pm has weight of -0.243296511049691
Humidity9am has weight of 0.06785448501888652
Humidity3pm has weight of 1.1508287376453727
Pressure9am has weight of 1.0735528250830841
Pressure3pm has weight of -1.4878636985237745
Cloud9am has weight of 0.1472449046402581
Cloud3pm has weight of 0.3779400410636295
Temp9am has weight of 0.16426165145324256
Temp3pm has weight of -0.38513718947028497
RainToday has weight of 0.2063929656524877

accuracy: 0.8499950656271588
[[15047   855]
 [ 2185  2179]]
MinTemp has weight of 0.09750261894570116
MaxTemp has weight of 0.12277477923528987
Rainfall has weight of 0.08606332866551111
WindGustDir has weight of 0.03264972640924209
WindGustSpeed has weight of 0.714761386688825
WindDir9am has weight of -0.11228917233628095
WindDir3pm has weight of 0.08829329685875298
WindSpeed9am has weight of -0.06505311377792543
WindSpeed3pm has weight of -0.25476916867201366
Humidity9am has weight of 0.08440229718746989
Humidity3pm has weight of 1.138122930183795
Pressure9am has weight of 1.0620734196857393
Pressure3pm has weight of -1.4743316781874245
Cloud9am has weight of 0.16697120750813768
Cloud3pm has weight of 0.36617057434056355
Temp9am has weight of 0.12298742488337844
Temp3pm has weight of -0.37051988019166227
RainToday has weight of 0.19092938641253024

accuracy: 0.8572485937037403
[[15204   763]
 [ 2130  2169]]
MinTemp has weight of 0.07302040723997306
MaxTemp has weight of 0.08458030408696654
Rainfall has weight of 0.09318183560487397
WindGustDir has weight of 0.025232641413779577
WindGustSpeed has weight of 0.7094144323779422
WindDir9am has weight of -0.10336584975259988
WindDir3pm has weight of 0.08902201133876082
WindSpeed9am has weight of -0.07049566426685164
WindSpeed3pm has weight of -0.2543680887498027
Humidity9am has weight of 0.08683201380044682
Humidity3pm has weight of 1.111887843622269
Pressure9am has weight of 1.0465128090905977
Pressure3pm has weight of -1.4589245132638304
Cloud9am has weight of 0.14628419784056057
Cloud3pm has weight of 0.38797471104965753
Temp9am has weight of 0.20236796413247532
Temp3pm has weight of -0.39875262633337
RainToday has weight of 0.19029213810486817

accuracy: 0.8593703740254613
[[15294   823]
 [ 2027  2122]]
MinTemp has weight of 0.06623758424301565
MaxTemp has weight of 0.1004015723854032
Rainfall has weight of 0.08126681846668103
WindGustDir has weight of 0.04051083373041905
WindGustSpeed has weight of 0.7026569498166365
WindDir9am has weight of -0.10312702090533875
WindDir3pm has weight of 0.08154594099614491
WindSpeed9am has weight of -0.06195353207767981
WindSpeed3pm has weight of -0.24465662358298976
Humidity9am has weight of 0.10301831303242426
Humidity3pm has weight of 1.1275368720523697
Pressure9am has weight of 1.0794857055571865
Pressure3pm has weight of -1.490098722167888
Cloud9am has weight of 0.1534189758020189
Cloud3pm has weight of 0.3705258607380597
Temp9am has weight of 0.1936797327253754
Temp3pm has weight of -0.3941834439416913
RainToday has weight of 0.18890144388839897
In [116]:
ply.offline.init_notebook_mode() # run at the start of every notebook

mean_weights = np.mean(weights_array,axis = 0)
std_weights = np.std(weights_array,axis = 0)
final_array = pd.DataFrame(data={'mean':mean_weights, 'std':std_weights}, index = column_names)
final_array = final_array.sort_values(by=['mean'])

error_y=dict(
            type='data',
            array=final_array['std'].values,
            visible=True
        )

graph1 = {'x': final_array.index,
          'y': final_array['mean'].values,
    'error_y':error_y,
       'type': 'bar'}

fig = dict()
fig['data'] = [graph1]
fig['layout'] = {'title': 'Logistic Regression Weights, with error bars'}

ply.offline.iplot(fig)

#grab coefficents with absolute values greater than user determined ammount
cutoff = 0.5
lr_voi = []
for index, columns in final_array.iterrows():
    if (columns['mean'] > cutoff) or (columns['mean'] < -cutoff):
        lr_voi.append(index)
In [117]:
lr_rainfall['RainTomorrow'] = y #add it back in for the original data
# now lets see the statistics of these attributes

df_grouped = lr_rainfall.groupby(['RainTomorrow'])

# plot KDE of Different variables
vars_to_plot = lr_voi

for v in vars_to_plot:
    plt.figure(figsize=(10,4))
    # plot original distributions
    plt.subplot(1,2,2)
    ax = df_grouped[v].plot.kde() 
    plt.legend(['no rain','rained'])
    plt.title(v+' (Original)')

Support Vector Machines

In [118]:
svm_rainfall = rainfall.copy()
# we want to predict the X and y data as follows:
if 'RainTomorrow' in lr_rainfall:
    y = svm_rainfall['RainTomorrow'].values # get the labels we want
    del svm_rainfall['RainTomorrow'] # get rid of the class label
    x = svm_rainfall.values # use everything else to predict!  
    
# split our data into training and testing splits
num_cv_iterations = 5
num_instances = len(y)
cv_object = ss(n_splits=num_cv_iterations, test_size  = 0.2)
                         
print(cv_object)
ShuffleSplit(n_splits=5, random_state=None, test_size=0.2, train_size=None)
In [119]:
weights = []
weights_array = []

# okay, so run through the cross validation loop and set the training and testing variable for one single iteration
for train_indices, test_indices in cv_object.split(x,y): 
    # I will create new variables here so that it is more obvious what 
    # the code is doing (you can compact this syntax and avoid duplicating memory,
    # but it makes this code less readable)
    X_train = x[train_indices]
    y_train = y[train_indices]
    
    X_test = x[test_indices]
    y_test = y[test_indices]
    
    X_train_scaled = scl_obj.transform(X_train) # apply to training
    X_test_scaled = scl_obj.transform(X_test) 
    
    #train the model just as before
    svm_clf = SVC(C=0.5, kernel='linear', degree=3, gamma='auto') # get object
    svm_clf.fit(X_train_scaled, y_train)  # train object

    y_hat = svm_clf.predict(X_test_scaled) # get test set precitions

    acc = mt.accuracy_score(y_test,y_hat)
    conf = mt.confusion_matrix(y_test,y_hat)
    print("")
    print('accuracy:', acc )
    print(conf)
    
    # sort these attributes and spit them out
    zip_vars = pd.Series(svm_clf.coef_[0],index=column_names) # combine attributes
    for name, coef in zip_vars.items():
        print(name, 'has weight of', coef) # now print them out
        weights.append(coef)
    weights_array.append(weights)
    weights = []
weights_array = np.array(weights_array)
accuracy: 0.8524622520477647
[[15199   693]
 [ 2297  2077]]
MinTemp has weight of -0.030242311095321384
MaxTemp has weight of 0.26825693906209835
Rainfall has weight of 0.10439140563255478
WindGustDir has weight of 0.01721429264875951
WindGustSpeed has weight of 0.47064724756933174
WindDir9am has weight of -0.07235551677806029
WindDir3pm has weight of 0.07792349953990652
WindSpeed9am has weight of -0.026807386407028844
WindSpeed3pm has weight of -0.2061833513463398
Humidity9am has weight of -0.02744668750801793
Humidity3pm has weight of 0.8540658508713932
Pressure9am has weight of 0.7676229358526143
Pressure3pm has weight of -1.0147647815535947
Cloud9am has weight of 0.07107586271786204
Cloud3pm has weight of 0.17050126868434745
Temp9am has weight of -0.004260294008361143
Temp3pm has weight of -0.2631632497998453
RainToday has weight of 0.13437993366505907

accuracy: 0.8551268133820191
[[15308   684]
 [ 2252  2022]]
MinTemp has weight of -0.03493956168699697
MaxTemp has weight of 0.3376189337134292
Rainfall has weight of 0.10456053967959633
WindGustDir has weight of 0.015497045919914854
WindGustSpeed has weight of 0.47388698360430226
WindDir9am has weight of -0.07320920209826909
WindDir3pm has weight of 0.07430202026883137
WindSpeed9am has weight of -0.034930447272529364
WindSpeed3pm has weight of -0.2119562944781137
Humidity9am has weight of -0.014189812966378668
Humidity3pm has weight of 0.8361228538715295
Pressure9am has weight of 0.7454855781661536
Pressure3pm has weight of -0.997088648551653
Cloud9am has weight of 0.06660281525159917
Cloud3pm has weight of 0.17650260688560593
Temp9am has weight of 0.0077566420448107465
Temp3pm has weight of -0.3393123174930679
RainToday has weight of 0.12652945504805757

accuracy: 0.8511299713806375
[[15245   720]
 [ 2297  2004]]
MinTemp has weight of -0.03387683515484241
MaxTemp has weight of 0.2971710759021562
Rainfall has weight of 0.10202943806098119
WindGustDir has weight of 0.01642685211635353
WindGustSpeed has weight of 0.46949628427989865
WindDir9am has weight of -0.06773029001851683
WindDir3pm has weight of 0.07429386283496342
WindSpeed9am has weight of -0.02447867808763249
WindSpeed3pm has weight of -0.22215363544000866
Humidity9am has weight of -0.01606043951301217
Humidity3pm has weight of 0.8354900453764458
Pressure9am has weight of 0.7806150426529257
Pressure3pm has weight of -1.0344883986263085
Cloud9am has weight of 0.07165823567197549
Cloud3pm has weight of 0.1714036498067344
Temp9am has weight of 0.03911192298791377
Temp3pm has weight of -0.3380482144917778
RainToday has weight of 0.13080519949267

accuracy: 0.8530543767887101
[[15253   696]
 [ 2282  2035]]
MinTemp has weight of -0.0189744542990411
MaxTemp has weight of 0.275541364669607
Rainfall has weight of 0.11112111052102591
WindGustDir has weight of 0.014624022391046765
WindGustSpeed has weight of 0.46721792332084533
WindDir9am has weight of -0.07729493661474862
WindDir3pm has weight of 0.07614855179249957
WindSpeed9am has weight of -0.038368450255745756
WindSpeed3pm has weight of -0.19771768660837097
Humidity9am has weight of -0.030093249115907383
Humidity3pm has weight of 0.8334121145203426
Pressure9am has weight of 0.7405690845853314
Pressure3pm has weight of -0.9884691023386267
Cloud9am has weight of 0.07704307777407848
Cloud3pm has weight of 0.17001978225903258
Temp9am has weight of 0.010527623390757412
Temp3pm has weight of -0.3034392053657484
RainToday has weight of 0.12975026923913902

accuracy: 0.8549787821967828
[[15309   726]
 [ 2213  2018]]
MinTemp has weight of -0.02723793428128829
MaxTemp has weight of 0.27036937935292826
Rainfall has weight of 0.11548394099975212
WindGustDir has weight of 0.012195965753392102
WindGustSpeed has weight of 0.4721578040919212
WindDir9am has weight of -0.07777274239992948
WindDir3pm has weight of 0.0794184324841325
WindSpeed9am has weight of -0.03573186649384752
WindSpeed3pm has weight of -0.20912735565195817
Humidity9am has weight of -0.0107359881294542
Humidity3pm has weight of 0.8154471828497663
Pressure9am has weight of 0.7748278691660744
Pressure3pm has weight of -1.0199515081222899
Cloud9am has weight of 0.07529751873948953
Cloud3pm has weight of 0.1675737905557071
Temp9am has weight of 0.049274358660881035
Temp3pm has weight of -0.3307124105000412
RainToday has weight of 0.1275980231569065
In [120]:
# look at the support vectors
print(svm_clf.support_vectors_.shape)
print(svm_clf.support_.shape)
print(svm_clf.n_support_ )
(28378, 18)
(28378,)
[14194 14184]
In [121]:
ply.offline.init_notebook_mode() # run at the start of every notebook

mean_weights = np.mean(weights_array,axis = 0)
std_weights = np.std(weights_array,axis = 0)
final_array = pd.DataFrame(data={'mean':mean_weights, 'std':std_weights}, index = column_names)
final_array = final_array.sort_values(by=['mean'])

error_y=dict(
            type='data',
            array=final_array['std'].values,
            visible=True
        )

graph1 = {'x': final_array.index,
          'y': final_array['mean'].values,
    'error_y':error_y,
       'type': 'bar'}

fig = dict()
fig['data'] = [graph1]
fig['layout'] = {'title': 'Support Vector Machines Weights, with error bars'}

ply.offline.iplot(fig)

#grab coefficents with absolute values greater than user determined ammount
cutoff = 0.5
svm_voi = []
for index, columns in final_array.iterrows():
    if (columns['mean'] > cutoff) or (columns['mean'] < -cutoff):
        svm_voi.append(index)
In [122]:
# make a dataframe of the training data
df_tested_on = svm_rainfall.iloc[train_indices] # saved from above, the indices chosen for training
# now get the support vectors from the trained model
df_support = df_tested_on.iloc[svm_clf.support_,:]

df_support['RainTomorrow'] = y[svm_clf.support_] # add back in the 'Survived' Column to the pandas dataframe
svm_rainfall['RainTomorrow'] = y # also add it back in for the original data
df_support.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 28378 entries, 99424 to 31373
Data columns (total 19 columns):
MinTemp          28378 non-null float64
MaxTemp          28378 non-null float64
Rainfall         28378 non-null float64
WindGustDir      28378 non-null float64
WindGustSpeed    28378 non-null float64
WindDir9am       28378 non-null float64
WindDir3pm       28378 non-null float64
WindSpeed9am     28378 non-null float64
WindSpeed3pm     28378 non-null float64
Humidity9am      28378 non-null float64
Humidity3pm      28378 non-null float64
Pressure9am      28378 non-null float64
Pressure3pm      28378 non-null float64
Cloud9am         28378 non-null float64
Cloud3pm         28378 non-null float64
Temp9am          28378 non-null float64
Temp3pm          28378 non-null float64
RainToday        28378 non-null int64
RainTomorrow     28378 non-null int64
dtypes: float64(17), int64(2)
memory usage: 4.3 MB
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:6: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

In [123]:
# group the original data and the support vectors
df_grouped_support = df_support.groupby(['RainTomorrow'])
df_grouped = svm_rainfall.groupby(['RainTomorrow'])

# plot KDE of Different variables
vars_to_plot = svm_voi

for v in vars_to_plot:
    plt.figure(figsize=(10,4))
    # plot support vector stats
    plt.subplot(1,2,1)
    ax = df_grouped_support[v].plot.kde() 
    plt.legend(['no rain','rained'])
    plt.title(v+' (Instances chosen as Support Vectors)')
    
    # plot original distributions
    plt.subplot(1,2,2)
    ax = df_grouped[v].plot.kde() 
    plt.legend(['no rain','rained'])
    plt.title(v+' (Original)')